Credit Scoring

Carlos Manchini

2024-02-28

Base de Dados

Análise Exploratória

Target e Médias das Variáveis Preditoras

y Contagem Prop VAR_1 VAR_2 VAR_3 VAR_4 VAR_5 VAR_6 VAR_7 VAR_8 VAR_9 VAR_10 VAR_11 VAR_12 VAR_13 VAR_14 VAR_15 VAR_16 VAR_17 VAR_18 VAR_19 VAR_20 VAR_21 VAR_22 VAR_23 VAR_24 VAR_25 VAR_26 VAR_27 VAR_28 VAR_29 VAR_30 VAR_31 VAR_32 VAR_33 VAR_34 VAR_35 VAR_36 VAR_37 VAR_38 VAR_39 VAR_40 VAR_41 VAR_42 VAR_43 VAR_44 VAR_45 VAR_46 VAR_47 VAR_48 VAR_49 VAR_50 VAR_51 VAR_52 VAR_53 VAR_54 VAR_55 VAR_56 VAR_57 VAR_58 VAR_59 VAR_60 VAR_61 VAR_62 VAR_63 VAR_64 VAR_65 VAR_66 VAR_67 VAR_68 VAR_69 VAR_70 VAR_71 VAR_72 VAR_73 VAR_74 VAR_75 VAR_76 VAR_77 VAR_78
0 7610 70.87 22.9 0.2 0.2 0.5 93.5 1664.2 67.4 69.5 1107.4 462.9 69.1 462.3 90.6 630.5 66.2 89.4 2492.5 58.3 9.2 10.1 1289.1 12.0 128.6 380.7 2.7 380.7 428.4 3.3 6.3 1681.2 14.9 0.1 2.9 289.3 285.5 3.5 4.4 274.5 326.0 3.6 3.6 8.7 18.9 4.5 373.4 10.1 74.5 1.4 3.5 200.5 369.5 268.7 2359.3 1810.5 199.2 202.4 47.1 333.2 315.0 0.0 3.2 1262.4 194.0 0.6 1330.3 17.8 -222.6 -0.8 531.4 4.5 0.4 176.1 0.4 0.5 629.4 480.6 64.8 0.5
1 3128 29.13 57.1 0.8 0.5 0.8 90.2 1064.0 68.9 74.5 1276.0 448.6 69.8 437.7 88.1 515.3 66.2 85.1 2044.1 52.5 4.5 11.2 1128.2 6.2 126.0 367.1 1.4 266.1 377.0 1.9 12.0 1796.7 9.0 0.1 1.7 322.6 296.8 4.9 10.6 286.4 314.5 2.3 5.2 15.7 17.6 3.1 363.5 18.5 85.2 8.2 5.4 201.4 339.5 269.7 2302.5 1775.9 198.4 203.3 43.3 318.3 290.6 -0.1 4.3 1221.7 195.5 0.7 925.8 17.2 -183.5 5.0 517.6 6.2 0.6 131.3 0.6 0.9 655.1 464.5 65.5 0.9
# Reproducible 70/30 stratified split on the target (caret preserves class
# proportions within each partition).
set.seed(369)
index <- caret::createDataPartition(data$y, p = 0.7, list = FALSE)
train0 <- data[index, ]
# Class balance of the training partition, in percent.
cat(prop.table(table(train0$y)) * 100)
70.97246 29.02754
test0 <- data[-index, ] ; cat(test0$y %>% table %>% prop.table*100)
70.63024 29.36976

Expandir código
library(plotly)

# Monthly bar chart of target counts (0 vs 1), rendered interactively.
ts_y <- ggplot(data, aes(x = AnoMes, fill = factor(y))) +
  geom_bar(width = 25, position = "dodge") +
  scale_fill_manual(
    values = c("darkgrey", "springgreen3"),
    labels = c("0", "1")
  ) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b") +
  labs(
    title = "Série Temporal - Target - 2014",
    x = "Data",
    y = "Número de Casos",
    fill = "Target"
  )

ggplotly(ts_y, height = 400, align = "center")

Variáveis

Proporção de Valores Ausentes e Information Value (IV)

Variáveis

Expandir código
# The original repeated the same aggregate+ggplot+ggplotly pipeline three
# times; factor it into one helper so adding variables is a one-liner.

# Monthly sum of `var` over `data` as an interactive line chart.
#   var   - column name (string) to aggregate by AnoMes
#   title - optional plot title (NULL = no title, as in the original plots 2-3)
plot_var_monthly <- function(var, title = NULL) {
  # reformulate("AnoMes", var) builds e.g. VAR_1 ~ AnoMes
  monthly <- aggregate(stats::reformulate("AnoMes", response = var),
                       data = data, FUN = sum)
  p <- ggplot(monthly, aes(x = AnoMes, y = .data[[var]])) +
    geom_line(color = "springgreen3", size = 1.1) +
    labs(x = "", y = var, title = title) +
    scale_x_date(date_labels = "%b %Y", date_breaks = "1 month")
  ggplotly(p)
}

pl1 <- plot_var_monthly("VAR_1", title = "Comportamento das Variáveis VAR_1:3")
pl2 <- plot_var_monthly("VAR_2")
pl3 <- plot_var_monthly("VAR_3")

subplot(pl1, pl2, pl3, nrows = 3, shareY = FALSE, margin = .05, titleY = TRUE, widths = .99, heights = c(.33, .33, .33))

Seleção (Correlação)

Expandir código
# Keep only predictors with Information Value >= 0.1.
bestiv <- IV$Summary[IV$Summary$IV >= 0.1, ]$Variable

# `id` is kept in train only (test keeps target + predictors, as before).
train <- train0 %>% subset(select = c("id", "y", bestiv))
test <- test0 %>% subset(select = c("y", bestiv))

# Fix: wrap the external character vector in all_of() — a bare vector inside
# dplyr::select() is ambiguous (deprecated warning, then error in dplyr >= 1.1).
# Only the 20 highest-IV variables are shown to keep the plot readable.
corrplot::corrplot(cor(train %>% select(id, y, all_of(bestiv[1:20])), use = "pairwise.complete.obs"),
                   method = "circle",
                   type = "upper",
                   diag = TRUE, 
                   tl.col = "black")

Seleção (Clusterização)

Expandir código
library(ClustOfVar)

# Hierarchical clustering of the (quantitative) predictors; `id` and the
# target are excluded so only candidate features are grouped.
predictors <- setdiff(names(train), c("id", "y"))
dendro <- hclustvar(X.quanti = as.data.frame(train[, predictors]))
plot(dendro, type = "tree")

Seleção (Clusterização)

Expandir código
library(reshape2) ; library(plyr)

# Choose the number of clusters from the dendrogram, then refine with k-means
# on variables; within each cluster the variable with the best (adjusted) IV
# is kept as the cluster representative.
# NOTE(review): `dendro[dendro$height < 0.5]` subsets the hclustvar list
# object itself with a recycled logical vector — presumably the intent was
# `sum(dendro$height < 0.5)` (merges below the threshold); confirm the
# resulting cluster count against the dendrogram before relying on it.
nvars <- length(dendro[dendro$height<0.5]) # aggregation criterion
part_init <- cutreevar(dendro, nvars)$cluster
# NOTE(review): the name `kmeans` masks stats::kmeans for the rest of the session.
kmeans <- kmeansvar(X.quanti = as.data.frame(train[,!(names(train) %in% c("id","y"))]),init = part_init)
# One row per variable with its cluster assignment (melt keeps variable names as rownames).
clusters <- cbind.data.frame(melt(kmeans$cluster), row.names(melt(kmeans$cluster)))
names(clusters) <- c("Cluster", "Variable")
# Attach the IV summary. NOTE(review): assumes IV$Summary has an `AdjIV`
# column in addition to `IV` — not visible in this file; verify.
clusters <- join(clusters, IV$Summary, by="Variable", type="left")
clusters <- clusters[order(clusters$Cluster),]
# Rank within each cluster by descending adjusted IV (Rank == 1 is the best).
clusters$Rank <- stats::ave(-clusters$AdjIV, clusters$Cluster, FUN=rank)
VARIAVEISclust <- subset(clusters, Rank==1) #%in% 1:2

DT::datatable(clusters %>% select(Cluster, Variable, IV, Rank), 
              width = 559,
              options = list(pageLength = 15, columnDefs = list(list(className = 'dt-center', targets = 1:3))))

Seleção Final

Expandir código
# Final predictor set: one representative variable per cluster.
bestiv_clust <- VARIAVEISclust$Variable

train2 <- train %>% select("y", all_of(bestiv_clust))
test2 <- test %>% select("y", all_of(bestiv_clust))

# Correlation among the retained predictors (and the target), with the
# coefficient printed inside each circle.
corrplot::corrplot(
  cor(train2, use = "pairwise.complete.obs"),
  method = "circle",
  type = "upper",
  diag = TRUE,
  tl.col = "black",
  addCoef.col = "black",
  number.cex = .6
)

Binning

Categorização automática

Expandir código
library(scorecard)

# Automatic WOE binning on the training set; two plot sets are built — one on
# the default (count) scale and one on the WOE scale.
bin1 <- woebin(train2, y = "y")
plotsbin <- woebin_plot(bin1, line_color = 'blue', bar_color = c('gray', 'springgreen3'))
plotsbinwoe <- woebin_plot(bin1, line_value = "woe", line_color = 'blue', bar_color = c('springgreen3', 'gray'))
ggplotly(plotsbinwoe[["VAR_1"]])

Definição Manual

Expandir código
# Manually curated bin breaks per variable. "missing%,%x" merges the missing
# category into the first numeric bin; a bare "missing" keeps NA as its own bin.
# (The redundant `{ }` wrapper around the assignment was dropped.)
breaks <- list(
  VAR_1 = c("missing%,%5", "30", "80", "140"),
  VAR_6 = c("missing%,%300", "800", "1400"),
  VAR_20 = c("5", "9", "11"),
  VAR_47 = c("missing", "20"),
  VAR_32 = c("0.09063413815", "0.1338329127"),
  VAR_19 = c("missing%,%1", "13", "23"),
  VAR_18 = c("missing", "15", "25", "65"),
  VAR_17 = c("missing", "1000", "1400", "3200", "4800"),
  VAR_43 = c("missing", "8", "28"),
  VAR_65 = c("missing", "500", "1100", "1900"),
  VAR_37 = c("missing", "2", "6", "20"),
  VAR_77 = c("missing", "10"),
  VAR_57 = c("30", "41", "46", "53"),
  VAR_14 = c("missing", "350", "650", "1100"),
  VAR_78 = c("missing", "1")
)

# Re-bin both partitions with the manual breaks (positive class = 1) so the
# test set uses exactly the same cut points as training.
bin_adj <- woebin(train2, y = "y", positive = 1, breaks_list = breaks)
bin_adj_test <- woebin(test2, y = "y", positive = 1, breaks_list = breaks)

# Count-scale and WOE-scale plots of the adjusted bins.
plotsbin_adj <- woebin_plot(bin_adj, line_color = 'blue', bar_color = c('springgreen3', 'gray'))
plotsbin_adjwoe <- woebin_plot(bin_adj, line_value = "woe", line_color = 'blue', bar_color = c('springgreen3', 'gray'))

ggplotly(plotsbin_adjwoe[["VAR_1"]])
variable bin count count_distr neg pos posprob woe bin_iv
VAR_1 [-Inf,5)%,%missing 3048 0.4054809 2449 599 0.1965223 -0.5141265 0.0948693
VAR_1 [5,30) 2219 0.2951976 1703 516 0.2325372 -0.2999929 0.0248191
VAR_1 [30,80) 1268 0.1686843 838 430 0.3391167 0.2268141 0.0090705
VAR_1 [80,140) 487 0.0647865 236 251 0.5154004 0.9556681 0.0676574
VAR_1 [140, Inf) 495 0.0658507 109 386 0.7797980 2.1585365 0.3377480

Binning

Categorização automática

Expandir código
ggplotly(plotsbinwoe$VAR_37)

Definição Manual

Expandir código
ggplotly(plotsbin_adjwoe$VAR_37)
variable bin count count_distr neg pos posprob woe bin_iv
VAR_37 missing 4260 0.5667154 2893 1367 0.3208920 0.1443715 0.0121592
VAR_37 [-Inf,2) 1298 0.1726753 1077 221 0.1702619 -0.6897250 0.0693803
VAR_37 [2,6) 600 0.0798191 459 141 0.2350000 -0.2862433 0.0061302
VAR_37 [6,20) 827 0.1100173 582 245 0.2962515 0.0288348 0.0000920
VAR_37 [20, Inf) 532 0.0707729 324 208 0.3909774 0.4508416 0.0155966

Modelo

Estimate Std. Error z value Pr(>|z|) OddsRatio Chance
(Intercept) -0.889 0.029 -30.753 0.000 NA NA
VAR_1_woe 1.021 0.042 24.298 0.000 2.775969 177.59693
VAR_6_woe 0.282 0.136 2.076 0.038 1.325779 32.57787
VAR_20_woe 1.310 0.079 16.616 0.000 3.706174 270.61737
VAR_47_woe 0.223 0.119 1.875 0.061 1.249821 24.98206
VAR_19_woe 0.286 0.133 2.149 0.032 1.331093 33.10925
VAR_17_woe 0.377 0.094 4.019 0.000 1.457904 45.79043
VAR_37_woe 0.503 0.096 5.249 0.000 1.653675 65.36749
VAR_77_woe 0.570 0.117 4.871 0.000 1.768267 76.82671
VAR_57_woe 0.692 0.087 7.915 0.000 1.997707 99.77070




Dados MSE RMSE LogLoss R2 KS AUC Gini
Treino 0.1644 0.4054 0.4988 0.2021 0.3835 0.7644 0.5288
Teste 0.1700 0.4123 0.5131 0.1806 0.3909 0.7567 0.5134

Diagnóstico

Validação Cruzada

KS

datset train validation
1 0.3793175 0.4577518
10 0.3880372 0.3726454
2 0.3835985 0.4069363
3 0.3820137 0.4271099
4 0.3871181 0.3892653
5 0.3881241 0.3948260
6 0.3796413 0.4647385
7 0.3843862 0.3991380
8 0.3947458 0.3351765
9 0.3865076 0.3680718

AUC

datset train validation
1 0.7606399 0.7965824
10 0.7649281 0.7601584
2 0.7624011 0.7820006
3 0.7626216 0.7819101
4 0.7648676 0.7601895
5 0.7660196 0.7498796
6 0.7623814 0.7871096
7 0.7662695 0.7457885
8 0.7698920 0.7113235
9 0.7658792 0.7521899

MSE

datset train validation
1 0.1647146 0.1614745
10 0.1645354 0.1632099
2 0.1646252 0.1620947
3 0.1656327 0.1522315
4 0.1645093 0.1632295
5 0.1642816 0.1651660
6 0.1653753 0.1550888
7 0.1644799 0.1635406
8 0.1623144 0.1849546
9 0.1627372 0.1791463


Teste de Goldfeld-Quandt (Homocedasticidade)


    Goldfeld-Quandt test

data:  logstep
GQ = 1.0007, df1 = 3749, df2 = 3748, p-value = 0.4919
alternative hypothesis: variance increases from segment 1 to 2

VIF (Variance Inflation Factor)

     variable     gvif
1:  VAR_1_woe 1.062664
2:  VAR_6_woe 3.883454
3: VAR_20_woe 1.049868
4: VAR_47_woe 2.232550
5: VAR_19_woe 2.785666
6: VAR_17_woe 1.178274
7: VAR_37_woe 1.083164
8: VAR_77_woe 1.769231
9: VAR_57_woe 1.057920

Matriz de Confusão

Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0 2156  681
         1  119  265
                                          
               Accuracy : 0.7516          
                 95% CI : (0.7363, 0.7665)
    No Information Rate : 0.7063          
    P-Value [Acc > NIR] : 5.366e-09       
                                          
                  Kappa : 0.2757          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.9477          
            Specificity : 0.2801          
         Pos Pred Value : 0.7600          
         Neg Pred Value : 0.6901          
             Prevalence : 0.7063          
         Detection Rate : 0.6694          
   Detection Prevalence : 0.8808          
      Balanced Accuracy : 0.6139          
                                          
       'Positive' Class : 0               
                                          

Escoragem

Expandir código
# Scorecard anchored at 600 points for 1:20 odds, with 20 points to double
# the odds (PDO).
card <- scorecard(bin_adj, logstep, points0 = 600, odds0 = 1/20, pdo = 20)

# Fix: the original named this variable `factor`, masking base::factor() for
# the rest of the session (factor(y) is used elsewhere in this report).
# NOTE(review): assumes predict(logstep) returns the linear predictor
# (log-odds), the default link-scale prediction for glm — confirm.
exp_eta <- exp(predict(logstep))      # odds = exp(linear predictor)
pdo_factor <- 20 / log(2)             # points per unit of log-odds
offset <- 600 - pdo_factor * log(20)  # anchors 600 points at 1:20 odds

# Side-by-side check: analytic probability/score vs model prediction and the
# scorecard package's own scoring.
data.frame(
  odds = exp_eta,
  prob_calc = round(exp_eta / (exp_eta + 1), 2),
  prob_pred = round(train_pred, 2),
  scorecard = scorecard_ply(train2, card, print_step = 0),
  score_calc = round(offset - pdo_factor * predict(logstep), 2)
) %>%
  DT::datatable(width = 599,
                options = list(pageLength = 8, searching = FALSE, lengthChange = FALSE))

–>

–>

–>

–>